package getGaokaoXuankeData.controller;

import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.usermodel.WorkbookFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Command-line tool that reads words from the first column of the first sheet of the
 * spreadsheet at {@link #EXCEL_PATH}, searches for each word inside the {@code <li>} items
 * of the tables on the pages listed in {@link #PAGE_URLS}, and writes one row per hit
 * (word, column heading, table caption, matching list item text, page URL) to a new
 * {@code .xls} file.
 */
public class GetWord {
    /** Base address of the server that hosts the reference pages. */
    private static final String GLOBAL_URL = "http://10.6.150.162:8020/";

    /** Location of the spreadsheet holding the words to look up. */
    private static final String EXCEL_PATH = "F:\\file\\202006\\西语大纲词汇-整理.xls";

    /** Upper bound on the number of spreadsheet rows that are read. */
    private static final int MAX_WORD_ROWS = 3139;

    /** Pages that are searched, in the order they are processed. */
    private static final String[] PAGE_URLS = {
            GLOBAL_URL + "pages/chouxiang-A1-A2..html" // abstract, level A
            , GLOBAL_URL + "pages/juti-A1-A2..html" // concrete, level A
            , GLOBAL_URL + "pages/chouxiang-B1-B2..html" // abstract, level B
            , GLOBAL_URL + "pages/juti-B1-B2..html" // concrete, level B
    };

    /** Matches a single ASCII digit; compiled once because it is used for every hit. */
    private static final Pattern DIGIT = Pattern.compile("[0-9]");

    /** Pages already downloaded during this run, keyed by URL, so each is fetched only once. */
    private static final HashMap<String, Document> PAGE_CACHE = new HashMap<String, Document>();

    /** Workbook that collects the results. */
    private static final HSSFWorkbook wb = new HSSFWorkbook();

    /** The single result sheet inside {@link #wb}. */
    private static final HSSFSheet sheet = wb.createSheet("单词001");

    /** Index of the next result row to be written to {@link #sheet}. */
    private static int hangshu = 0;

    /**
     * Reads the word list, searches every non-blank word in all pages and finally writes the
     * collected results to {@code F:\单词<timestamp>.xls}.
     *
     * @param args ignored
     * @throws IOException if the spreadsheet cannot be read, a page cannot be fetched, or the
     *                     result file cannot be written
     */
    public static void main(String[] args) throws IOException {
        List<String> words = readWords();
        for (String word : words) {
            System.out.println("当前单词：" + word);
            // Blank cells must be skipped: an empty search term would match almost any text.
            if (!word.isEmpty()) {
                getHtmlVal(word);
            }
        }

        long time = System.currentTimeMillis();
        // try-with-resources guarantees the file is closed even when writing fails.
        try (FileOutputStream output = new FileOutputStream("F:\\单词" + time + ".xls")) {
            wb.write(output);
            output.flush();
        }
    }

    /**
     * Searches {@code word} in every list item of every table cell of every page and records
     * each hit through {@link #write_excel}.
     *
     * @param word the word to look for
     * @return always {@code "ok"}
     * @throws IOException if a page cannot be fetched
     */
    private static String getHtmlVal(String word) throws IOException {
        System.out.println("开始比较");
        for (int i = 0; i < PAGE_URLS.length; i++) {
            System.out.println("第" + i + "网页");
            String url = PAGE_URLS[i];
            Elements tables = loadPage(url).select("table");
            for (int j = 0; j < tables.size(); j++) {
                System.out.println("第" + j + "个table");
                Element table = tables.get(j);
                Elements cells = table.select("tbody").select("tr").select("td");
                for (Element cell : cells) {
                    Elements items = cell.select("li");
                    System.out.println("li标签的数量：" + items.size());
                    for (Element item : items) {
                        HashMap<?, ?> hit = searchWord(word, item.text());
                        if (!Boolean.TRUE.equals(hit.get("flag"))) {
                            continue;
                        }
                        String title = cleanTitle(table.select("caption").text());
                        String fenlei = columnHeading(table, cell);
                        String liju = hit.get("liju").toString();
                        write_excel(word, title, fenlei, liju, url);
                    }
                }
            }
        }
        return "ok";
    }

    /**
     * Returns the parsed page for {@code url}, downloading it only the first time it is
     * requested so that the pages are not fetched again for every word.
     */
    private static Document loadPage(String url) throws IOException {
        Document page = PAGE_CACHE.get(url);
        if (page == null) {
            page = Jsoup.connect(url).get();
            PAGE_CACHE.put(url, page);
        }
        return page;
    }

    /**
     * Normalises a table caption: if it contains a digit, all digits and dots are removed;
     * if it then contains {@code '['}, everything from the first {@code '['} on is dropped.
     */
    private static String cleanTitle(String caption) {
        String title = caption;
        if (isContainNumber(title)) {
            title = title.replaceAll("[\\d.]", "");
        }
        if (isContain_zkh(title)) {
            title = title.substring(0, title.indexOf("["));
        }
        return title;
    }

    /**
     * Returns the text of the header cell ({@code thead > tr > th}) that sits above
     * {@code cell}. The column is the cell's position inside its own row, so tables with
     * several body rows resolve to the right heading; when no heading exists at that position
     * an empty string is returned instead of failing.
     */
    private static String columnHeading(Element table, Element cell) {
        Elements headings = table.select("thead").select("tr").select("th");
        int column = cell.elementSiblingIndex();
        return column < headings.size() ? headings.get(column).text() : "";
    }

    /**
     * Logs one hit and appends it as a new row to the result sheet.
     *
     * @param word    the word that was found
     * @param title   cleaned caption of the table containing the hit
     * @param fenlei  heading of the column containing the hit
     * @param lj_text text of the list item in which the word was found
     * @param url     address of the page containing the hit
     */
    private static void write_excel(String word, String title, String fenlei, String lj_text, String url) {
        System.out.println("单词：" + word);
        System.out.println("单词title：" + title);
        System.out.println("单词fenlei：" + fenlei);
        System.out.println("单词lj_text：" + lj_text);
        System.out.println("单词url：" + url);
        HSSFRow row = sheet.createRow(hangshu); // next free row of the result sheet
        row.createCell(0).setCellValue(word); // word
        row.createCell(1).setCellValue(fenlei); // column heading
        row.createCell(2).setCellValue(title); // table caption
        row.createCell(3).setCellValue(lj_text); // matching list item text
        row.createCell(4).setCellValue(url); // source page
        hangshu++;
    }

    /**
     * Maps an index to a category label: indices 0 and 1 give "抽象", larger ones "具体".
     * NOTE(review): not called anywhere in this class, and the mapping does not line up with
     * the order of {@link #PAGE_URLS} (abstract/concrete alternate there) - confirm before use.
     */
    private static String get_url_type(int i) {
        String type = "抽象";
        if (i > 1) {
            type = "具体";
        }
        return type;
    }

    /**
     * Checks whether {@code li_str} contains {@code regEx} as a whole word.
     *
     * <p>Despite its name, {@code regEx} is treated as literal text: it is quoted before being
     * embedded in the pattern, so characters such as {@code ( ) . ? /} in a word neither break
     * pattern compilation nor change its meaning. Word edges are detected with Unicode-aware
     * look-arounds, so accented letters count as word characters on every Java version and
     * words that start or end with punctuation can still be found.
     *
     * @param regEx  the word to look for; {@code null} or empty never matches
     * @param li_str the text to search; {@code null} never matches
     * @return a map with key {@code "flag"} (Boolean, whether the word was found) and key
     *         {@code "liju"} ({@code li_str} on a hit, otherwise an empty string)
     */
    public static HashMap searchWord(String regEx, String li_str) {
        System.out.println("传过来的：" + li_str);
        HashMap<String, Object> map = new HashMap<String, Object>();
        boolean result = false;
        if (regEx != null && !regEx.isEmpty() && li_str != null) {
            Pattern pattern = Pattern.compile(
                    "(?<!\\w)" + Pattern.quote(regEx) + "(?!\\w)",
                    Pattern.UNICODE_CHARACTER_CLASS);
            Matcher matcher = pattern.matcher(li_str);
            result = matcher.find();
        }
        map.put("liju", result ? li_str : "");
        map.put("flag", result);
        return map;
    }

    /**
     * Reads the first column of the first sheet of the spreadsheet at {@link #EXCEL_PATH}.
     * The file is opened once and closed again before returning. At most
     * {@link #MAX_WORD_ROWS} rows are read, and never more than the sheet contains.
     *
     * @return one trimmed entry per row; rows without a first cell yield an empty string
     * @throws IOException if the spreadsheet cannot be opened or read
     */
    private static List<String> readWords() throws IOException {
        List<String> words = new ArrayList<String>();
        try (Workbook workbook = WorkbookFactory.create(new File(EXCEL_PATH))) {
            Sheet wordSheet = workbook.getSheetAt(0);
            int rowCount = Math.min(MAX_WORD_ROWS, wordSheet.getLastRowNum() + 1);
            for (int i = 0; i < rowCount; i++) {
                Row row = wordSheet.getRow(i); // null when the row was never created
                Object cell = (row == null) ? null : row.getCell(0); // null when the cell is missing
                words.add(cell == null ? "" : cell.toString().trim());
            }
        }
        return words;
    }

    /**
     * Tells whether the text contains at least one ASCII digit.
     *
     * @param company the text to inspect; {@code null} is treated as containing no digit
     * @return {@code true} if a digit {@code 0-9} occurs in the text
     */
    public static boolean isContainNumber(String company) {
        return company != null && DIGIT.matcher(company).find();
    }

    /** Tells whether the text contains the {@code '['} character. */
    private static boolean isContain_zkh(String str) {
        return str.contains("[");
    }

}
